In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
dataset = pd.read_csv('train.csv')
In [3]:
dataset.head()
Out[3]:
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
| 1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
| 2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
| 3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
| 4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 81 columns
In [4]:
dataset.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1460 entries, 0 to 1459 Data columns (total 81 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Id 1460 non-null int64 1 MSSubClass 1460 non-null int64 2 MSZoning 1460 non-null object 3 LotFrontage 1201 non-null float64 4 LotArea 1460 non-null int64 5 Street 1460 non-null object 6 Alley 91 non-null object 7 LotShape 1460 non-null object 8 LandContour 1460 non-null object 9 Utilities 1460 non-null object 10 LotConfig 1460 non-null object 11 LandSlope 1460 non-null object 12 Neighborhood 1460 non-null object 13 Condition1 1460 non-null object 14 Condition2 1460 non-null object 15 BldgType 1460 non-null object 16 HouseStyle 1460 non-null object 17 OverallQual 1460 non-null int64 18 OverallCond 1460 non-null int64 19 YearBuilt 1460 non-null int64 20 YearRemodAdd 1460 non-null int64 21 RoofStyle 1460 non-null object 22 RoofMatl 1460 non-null object 23 Exterior1st 1460 non-null object 24 Exterior2nd 1460 non-null object 25 MasVnrType 1452 non-null object 26 MasVnrArea 1452 non-null float64 27 ExterQual 1460 non-null object 28 ExterCond 1460 non-null object 29 Foundation 1460 non-null object 30 BsmtQual 1423 non-null object 31 BsmtCond 1423 non-null object 32 BsmtExposure 1422 non-null object 33 BsmtFinType1 1423 non-null object 34 BsmtFinSF1 1460 non-null int64 35 BsmtFinType2 1422 non-null object 36 BsmtFinSF2 1460 non-null int64 37 BsmtUnfSF 1460 non-null int64 38 TotalBsmtSF 1460 non-null int64 39 Heating 1460 non-null object 40 HeatingQC 1460 non-null object 41 CentralAir 1460 non-null object 42 Electrical 1459 non-null object 43 1stFlrSF 1460 non-null int64 44 2ndFlrSF 1460 non-null int64 45 LowQualFinSF 1460 non-null int64 46 GrLivArea 1460 non-null int64 47 BsmtFullBath 1460 non-null int64 48 BsmtHalfBath 1460 non-null int64 49 FullBath 1460 non-null int64 50 HalfBath 1460 non-null int64 51 BedroomAbvGr 1460 non-null int64 52 KitchenAbvGr 1460 non-null int64 53 KitchenQual 1460 non-null object 54 TotRmsAbvGrd 1460 non-null int64 55 Functional 1460 non-null object 56 Fireplaces 1460 non-null int64 57 FireplaceQu 770 non-null object 58 GarageType 1379 non-null object 59 GarageYrBlt 1379 non-null float64 60 GarageFinish 1379 non-null object 61 GarageCars 1460 non-null int64 62 GarageArea 1460 non-null int64 63 GarageQual 1379 non-null object 64 GarageCond 1379 non-null object 65 PavedDrive 1460 non-null object 66 WoodDeckSF 1460 non-null int64 67 OpenPorchSF 1460 non-null int64 68 EnclosedPorch 1460 non-null int64 69 3SsnPorch 1460 non-null int64 70 ScreenPorch 1460 non-null int64 71 PoolArea 1460 non-null int64 72 PoolQC 7 non-null object 73 Fence 281 non-null object 74 MiscFeature 54 non-null object 75 MiscVal 1460 non-null int64 76 MoSold 1460 non-null int64 77 YrSold 1460 non-null int64 78 SaleType 1460 non-null object 79 SaleCondition 1460 non-null object 80 SalePrice 1460 non-null int64 dtypes: float64(3), int64(35), object(43) memory usage: 924.0+ KB
Data Cleaning¶
In [5]:
dataset.isnull().sum().sum()
Out[5]:
6965
In [6]:
dataset.drop(columns=['Id'],inplace=True)
dataset.drop(columns=['PoolQC'],inplace=True)
In [7]:
dataset['FireplaceQu'].value_counts()
Out[7]:
FireplaceQu Gd 380 TA 313 Fa 33 Ex 24 Po 20 Name: count, dtype: int64
In [8]:
dataset['BsmtExposure'].replace('No', 'No_Exposure', inplace=True)
C:\Users\omdes\AppData\Local\Temp\ipykernel_10224\522399936.py:1: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
dataset['BsmtExposure'].replace('No', 'No_Exposure', inplace=True)
In [9]:
l = ['BsmtQual', 'BsmtCond', 'BsmtFinType1', 'BsmtFinType2','GarageCond','GarageQual','BsmtExposure', 'GarageFinish','Alley','Fence','MiscFeature','FireplaceQu','GarageType']
for i in l:
a = 'NO_' + i
dataset[i].fillna(a, inplace=True)
C:\Users\omdes\AppData\Local\Temp\ipykernel_10224\2723856173.py:4: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
dataset[i].fillna(a, inplace=True)
In [10]:
dataset['GarageYrBlt'].describe()
Out[10]:
count 1379.000000 mean 1978.506164 std 24.689725 min 1900.000000 25% 1961.000000 50% 1980.000000 75% 2002.000000 max 2010.000000 Name: GarageYrBlt, dtype: float64
In [11]:
m=dataset['MasVnrArea'].mean()
dataset['MasVnrArea'].fillna(value=m,inplace=True)
C:\Users\omdes\AppData\Local\Temp\ipykernel_10224\1282548748.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
dataset['MasVnrArea'].fillna(value=m,inplace=True)
In [12]:
dataset['GarageType'].fillna(value='NO_Garage',inplace=True)
In [13]:
m=dataset['GarageYrBlt'].mean()
dataset['GarageYrBlt'].fillna(value=m,inplace=True)
C:\Users\omdes\AppData\Local\Temp\ipykernel_10224\2200549702.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
dataset['GarageYrBlt'].fillna(value=m,inplace=True)
In [14]:
dataset['LotFrontage'].fillna(value=0,inplace=True)
C:\Users\omdes\AppData\Local\Temp\ipykernel_10224\468638783.py:1: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
dataset['LotFrontage'].fillna(value=0,inplace=True)
In [15]:
dataset.dropna(subset=['MasVnrType'],inplace=True)
In [16]:
dataset.dropna(subset=['Electrical'], inplace=True)
In [17]:
dataset.isnull().sum().sum()
Out[17]:
0
In [18]:
from sklearn.preprocessing import LabelEncoder
In [19]:
le=LabelEncoder()
col= dataset.columns
for i in col:
if dataset[i].dtype=='object':
dataset[i]=le.fit_transform(dataset[i])
Data visualization¶
In [24]:
numeric_cols = dataset.select_dtypes(include=['int64', 'float64']).columns
num_plots = len(numeric_cols)
fig, axes = plt.subplots(nrows=num_plots, ncols=1, figsize=(10, num_plots * 5))
# Iterate through numeric columns and plot boxplots
for i, col in enumerate(numeric_cols):
sns.boxplot(x=dataset[col], ax=axes[i])
axes[i].set_title(f'Boxplot of {col}')
axes[i].set_ylabel('')
axes[i].set_xlabel(col)
# Adjust layout
plt.tight_layout()
plt.show()
In [26]:
plt.figure(figsize=(100,100))
sns.heatmap(dataset.corr(),annot=True)
#plt.savefig('new.jpg')
plt.show()
Spliting the Data¶
In [20]:
x=dataset.iloc[:,:-1]
y=dataset['SalePrice']
In [21]:
from sklearn.model_selection import train_test_split
In [22]:
from sklearn.linear_model import LinearRegression
In [23]:
for i in range(0,50):
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=i)
lr=LinearRegression()
lr.fit(x_train,y_train)
print(lr.score(x_test,y_test)*100 ,lr.score(x_train,y_train)*100,i)
86.47330229791923 84.10483134384816 0 79.88152984394141 85.10822246085813 1 87.38288925893183 83.74364321466949 2 76.93473000432358 86.04087189474987 3 64.91325276783641 88.12859146140312 4 85.81455070823033 84.11729837300938 5 86.52002812452464 84.12626232846658 6 64.03752598381196 87.53427767857502 7 87.12935274209269 83.87421940931887 8 87.07765000619102 83.8741742268535 9 59.07539058827744 87.8488627766987 10 87.84385099979836 83.91047427725411 11 79.76638561893378 85.45345544816179 12 59.9824205327973 88.06164644369268 13 81.5551739682284 84.83254725352835 14 85.30287077270754 84.38723323930544 15 87.89394612196676 83.66421369210117 16 76.08971566360276 86.03341719620015 17 85.98345973775426 84.12106516206612 18 84.05144832356197 84.49950472488524 19 87.82751199023474 83.59111917765183 20 85.13300986315964 84.17248607344823 21 78.26352253277004 85.8415813162837 22 64.76659011762702 88.708530157837 23 63.9408005662218 87.9883611433164 24 88.89042462551889 83.68109835418468 25 51.33177343604266 87.95759957371457 26 83.6221316612232 84.55232093620229 27 59.851129743049725 87.8808656210925 28 72.30274825437557 86.32538433904642 29 80.2470392023704 85.25404061935316 30 62.804896715507795 87.54457703220027 31 80.1957862369642 85.35081949976863 32 84.94910574647024 84.26862256804347 33 86.81555718677441 83.98219066448522 34 73.33490314360259 86.89598560842927 35 79.41278497144695 85.77836558755274 36 81.52423880505336 84.85778590797102 37 85.28713162919874 84.34761578325069 38 84.88023581609512 84.51405095849856 39 85.08869221503002 84.34142920149644 40 51.250289826999776 90.55594904908023 41 85.9528813250527 84.22113559443956 42 85.38306428648224 84.17090937680783 43 85.67305271629586 84.06872135671271 44 79.57967706094664 85.70739670808919 45 88.64377641406878 83.48453092036794 46 86.53821426488622 83.81821064913882 47 68.63956582626258 86.15832493965819 48 58.931182028644315 88.33174781419751 49
In [24]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=25)
Model Taining¶
Linear Regression¶
In [25]:
lr=LinearRegression()
lr.fit(x_train,y_train)
Out[25]:
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [26]:
print(lr.score(x_test,y_test)*100 ,lr.score(x_train,y_train)*100)
88.89042462551889 83.68109835418468
In [32]:
# a=list(dataset.columns[5:15])
# a.append('SalePrice')
In [33]:
# plt.figure(figsize=(20,20))
# sns.heatmap(dataset[a].corr(),annot=True)
# plt.show()
Decision Tree Regressor¶
In [27]:
from sklearn.tree import DecisionTreeRegressor
In [28]:
dt=DecisionTreeRegressor(max_depth=10,criterion='absolute_error',splitter='best')
dt.fit(x_train,y_train)
dt.score(x_test,y_test)*100 ,dt.score(x_train,y_train)*100
Out[28]:
(72.49705083863368, 97.46803434539068)
In [36]:
# (77.23414771776245, 99.94543820951287)
#(79.8309960906415, 99.81163848432743)
#(68.42982568958558, 99.73093260761057)
RandomForestRegressor¶
In [32]:
from sklearn.ensemble import RandomForestRegressor
In [33]:
rf=RandomForestRegressor(n_estimators=30)
rf.fit(x_train,y_train)
rf.score(x_test,y_test)*100,rf.score(x_train,y_train)*100
Out[33]:
(89.31331844344814, 97.82422679590651)
finding the best parameters¶
In [29]:
from sklearn.model_selection import GridSearchCV
In [30]:
from sklearn.tree import plot_tree
In [42]:
plt.figure(figsize=(50,50))
plot_tree(dt)
plt.savefig('demo.jpg')
plt.show()
In [31]:
df1={'criterion':["squared_error","friedman_mse", "poisson", "absolute_error"], #
'splitter':["best", "random"],
'max_depth':[i for i in range(2,20)]}
In [44]:
gs=GridSearchCV(DecisionTreeRegressor(),param_grid=df1)
gs.fit(x_train,y_train)
Out[44]:
GridSearchCV(estimator=DecisionTreeRegressor(),
param_grid={'criterion': ['squared_error', 'absolute_error'],
'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19],
'splitter': ['best', 'random']})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(estimator=DecisionTreeRegressor(),
param_grid={'criterion': ['squared_error', 'absolute_error'],
'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19],
'splitter': ['best', 'random']})DecisionTreeRegressor(criterion='absolute_error', max_depth=6)
DecisionTreeRegressor(criterion='absolute_error', max_depth=6)
In [45]:
gs.best_params_
Out[45]:
{'criterion': 'absolute_error', 'max_depth': 6, 'splitter': 'best'}
In [46]:
gs.score(x_test,y_test)*100 ,gs.score(x_train,y_train)*100
Out[46]:
(77.62778638010373, 87.6331897154222)
In [34]:
df2 = {'criterion':["squared_error", "friedman_mse", "absolute_error","poisson"],
'n_estimators':[100]
}
In [47]:
gs2=GridSearchCV(RandomForestRegressor(),param_grid=df2)
gs2.fit(x_train,y_train)
Out[47]:
GridSearchCV(estimator=RandomForestRegressor(),
param_grid={'criterion': ['squared_error', 'friedman_mse',
'absolute_error', 'poisson'],
'n_estimators': [100]})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(estimator=RandomForestRegressor(),
param_grid={'criterion': ['squared_error', 'friedman_mse',
'absolute_error', 'poisson'],
'n_estimators': [100]})RandomForestRegressor(criterion='absolute_error')
RandomForestRegressor(criterion='absolute_error')
In [48]:
gs2.score(x_test,y_test)*100,gs2.score(x_train,y_train)*100
Out[48]:
(90.083546996203, 97.90819580716888)
In [49]:
gs2.best_params_
Out[49]:
{'criterion': 'absolute_error', 'n_estimators': 100}
Using the best paramaters in Voting Regressor¶
In [35]:
from sklearn.ensemble import VotingRegressor
In [36]:
li=[('lr',LinearRegression()),('rf',RandomForestRegressor(criterion='absolute_error', n_estimators=100))]
In [37]:
vr=VotingRegressor(li,n_jobs=50)
vr.fit(x_train,y_train)
Out[37]:
VotingRegressor(estimators=[('lr', LinearRegression()),
('rf',
RandomForestRegressor(criterion='absolute_error'))],
n_jobs=50)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
VotingRegressor(estimators=[('lr', LinearRegression()),
('rf',
RandomForestRegressor(criterion='absolute_error'))],
n_jobs=50)LinearRegression()
RandomForestRegressor(criterion='absolute_error')
In [42]:
vr.score(x_test,y_test)*100,vr.score(x_train,y_train)*100
#90.31498422854605, 93.1039525556599
Out[42]:
(92.05456381904091, 93.17519744717606)
In [43]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
y_pred = vr.predict(x_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")
Mean Absolute Error (MAE): 15010.202119001191 Mean Squared Error (MSE): 417364736.750932 Root Mean Squared Error (RMSE): 20429.506522452568 R-squared (R²): 0.920545638190409